Please see the paper, vignette, manual, and vst notebook for much greater detail on specific aspects of DESeq2.

This notebook is derived from the normalization run on microbiome data .

libraries

reqpkg <- c("DESeq2","dplyr","magrittr","DT","biomaRt","vsn","pheatmap","RColorBrewer","ggplot2","ggpubr")
for (i in reqpkg) {
    print(i)
    print(packageVersion(i))
    suppressWarnings(suppressMessages(library(i, quietly=T, verbose=T, warn.conflicts=F,character.only=T)))
}
## [1] "DESeq2"
## [1] '1.26.0'
## [1] "dplyr"
## [1] '0.8.5'
## [1] "magrittr"
## [1] '1.5'
## [1] "DT"
## [1] '0.13'
## [1] "biomaRt"
## [1] '2.42.1'
## [1] "vsn"
## [1] '3.54.0'
## [1] "pheatmap"
## [1] '1.0.12'
## [1] "RColorBrewer"
## [1] '1.1.2'
## [1] "ggplot2"
## [1] '3.3.0'
## [1] "ggpubr"
## [1] '0.3.0'
theme_set(theme_pubr())
splits <- list("WTSPF"=1:18, "KOSPF"=19:36, "WTGF"=37:54, "KOGF"=55:72)

import data

# for (i in 1:10) {
#   try({
#     mouseProteinCodingGenes <- useMart("ensembl") %>%
#       useDataset("mmusculus_gene_ensembl",mart=.) %>%
#       getBM(attributes=c("ensembl_gene_id","external_gene_name","description"), filters='biotype', values=c('protein_coding'), mart=.)
#     break
#   }, silent = T)
#   Sys.sleep(2)
# }

df <- read.csv2("../../../KF_RNASeq_counts_filtered.txt", sep = ",") %>% 
  dplyr::select(-X) %>%
  # .[rowSums(sapply(., '%in%', mouseProteinCodingGenes$ensembl_gene_id)) > 0,] %>% 
  set_rownames(.$Geneid) %>% 
  dplyr::select(-Geneid)

WT SPF

df[, splits$WTSPF] %>% head(20) %>%  datatable(filter = "bottom")

KO SPF

df[, splits$KOSPF] %>% head(20) %>%  datatable(filter = "bottom")

WT GF

df[, splits$WTGF] %>% head(20) %>%  datatable(filter = "bottom")

KO GF

df[, splits$KOGF] %>% head(20) %>%  datatable(filter = "bottom")

generate DESeq object

condition_list <- data.frame(row.names=colnames(df), 
                             Genotype=rep(c(rep("WT",18), rep("KO",18)),2), 
                             Condition=c(rep("SPF",36),rep("GF",36)))
condition_list$Genotype <- relevel(condition_list$Genotype, "WT")
condition_list$Condition <- relevel(condition_list$Condition, "SPF")
dds <- DESeq2::DESeqDataSetFromMatrix(countData=df,
                                      colData = condition_list,
                                      design = ~ Genotype + Condition + Genotype:Condition)

head(dds)
## class: DESeqDataSet 
## dim: 6 72 
## metadata(1): version
## assays(1): counts
## rownames(6): ENSMUSG00000102693 ENSMUSG00000064842 ...
##   ENSMUSG00000103377 ENSMUSG00000104017
## rowData names(0):
## colnames(72): KF_01 KF_02 ... KF_71 KF_72
## colData names(2): Genotype Condition
dds <- estimateSizeFactors(dds)

# Creating DESeqTransform object from raw data to compare with transformations downstream
dds_dud <- SummarizedExperiment(counts(dds), colData = colData(dds)) %>%
  DESeqTransform()

normalization

I’ll attempt 4 types of normalization by DESeq2

  • log2 normalized to estimated size factors (median of the ratios)
  • variance stabilization
  • regularized log
  • log10 scaling only

Of these, all besides log10 scaling are normalizations that consider library size.

log2 normalized to estimated size factors (ntd)

This is \(\log_2 (n + 1)\), where \(n\) is normalized to estimated size factors. However, since counts data tends to prioritize available normalization factors before defaulting to estimated size factors, I’ll first check the available normalization factors and estimated size factors in the DESeqData object.
The result of this function are psuedocounts, basically just scaled counts.

normalizationFactors(dds)
## NULL
sizeFactors(dds) %>% head()
##    KF_01    KF_02    KF_03    KF_04    KF_05    KF_06 
## 1.098547 1.133832 1.032688 1.088866 1.117984 1.110088
ntd <- normTransform(dds)
ntd_assay <- assay(ntd)
colData(ntd)
## DataFrame with 72 rows and 3 columns
##       Genotype Condition        sizeFactor
##       <factor>  <factor>         <numeric>
## KF_01       WT       SPF  1.09854743602553
## KF_02       WT       SPF  1.13383224370949
## KF_03       WT       SPF  1.03268839468746
## KF_04       WT       SPF  1.08886640783561
## KF_05       WT       SPF  1.11798388435345
## ...        ...       ...               ...
## KF_68       KO        GF  0.89792789641738
## KF_69       KO        GF  1.03466716910215
## KF_70       KO        GF 0.994248301676296
## KF_71       KO        GF 0.896223923614269
## KF_72       KO        GF 0.976403314999965

WT SPF

ntd_assay[, splits$WTSPF] %>% head(20) %>% datatable(filter = "bottom")

KO SPF

ntd_assay[, splits$KOSPF] %>% head(20) %>% datatable(filter = "bottom")

WT GF

ntd_assay[, splits$WTGF] %>% head(20) %>% datatable(filter = "bottom")

KO GF

ntd_assay[, splits$KOGF] %>% head(20) %>% datatable(filter = "bottom")

variance stabilized transformation (vst)

Conceptually, vst transforms data to make it homoscedastic, that is the variance across ranks (i.e each sample) should be stabilized.

DESeq2 requires > 10000 items/sample to run vst, the subset and faster version of variance stabilization. We need to use varianceStabilizingTransformation because there are orders of magnitude less items/sample in 16S data.

vsd <- varianceStabilizingTransformation(dds)
vsd_assay <- assay(vsd)
colData(vsd)
## DataFrame with 72 rows and 3 columns
##       Genotype Condition        sizeFactor
##       <factor>  <factor>         <numeric>
## KF_01       WT       SPF  1.09854743602553
## KF_02       WT       SPF  1.13383224370949
## KF_03       WT       SPF  1.03268839468746
## KF_04       WT       SPF  1.08886640783561
## KF_05       WT       SPF  1.11798388435345
## ...        ...       ...               ...
## KF_68       KO        GF  0.89792789641738
## KF_69       KO        GF  1.03466716910215
## KF_70       KO        GF 0.994248301676296
## KF_71       KO        GF 0.896223923614269
## KF_72       KO        GF 0.976403314999965

WT SPF

vsd_assay[, splits$WTSPF] %>% head(20) %>% datatable(filter = "bottom")

KO SPF

vsd_assay[, splits$KOSPF] %>% head(20) %>% datatable(filter = "bottom")

WT GF

vsd_assay[, splits$WTGF] %>% head(20) %>% datatable(filter = "bottom")

KO GF

vsd_assay[, splits$KOGF] %>% head(20) %>% datatable(filter = "bottom")

regularized log transformation (rlog)

This is similar to \(\log_2 (n + n_0)\), where \(n\) is the counts and \(n_0\) is a positive constant. Here, \(n_0\) is varied based on the dispersion-mean trend, in the equation (taken from the DESeq2 vignette, refer to vignette for more details) \(\log_2 (q_{ij} = \beta_{i0} + \beta_{ij})\).

rld <- rlog(dds)
## rlog() may take a long time with 50 or more samples,
## vst() is a much faster transformation
rld_assay <- assay(rld)
colData(rld)
## DataFrame with 72 rows and 3 columns
##       Genotype Condition        sizeFactor
##       <factor>  <factor>         <numeric>
## KF_01       WT       SPF  1.09854743602553
## KF_02       WT       SPF  1.13383224370949
## KF_03       WT       SPF  1.03268839468746
## KF_04       WT       SPF  1.08886640783561
## KF_05       WT       SPF  1.11798388435345
## ...        ...       ...               ...
## KF_68       KO        GF  0.89792789641738
## KF_69       KO        GF  1.03466716910215
## KF_70       KO        GF 0.994248301676296
## KF_71       KO        GF 0.896223923614269
## KF_72       KO        GF 0.976403314999965

WT SPF

rld_assay[, splits$WTSPF] %>% head(20) %>% datatable(filter = "bottom")

KO SPF

rld_assay[, splits$KOSPF] %>% head(20) %>% datatable(filter = "bottom")

WT GF

rld_assay[, splits$WTGF] %>% head(20) %>% datatable(filter = "bottom")

KO GF

rld_assay[, splits$KOGF] %>% head(20) %>% datatable(filter = "bottom")

log10

I’ll create a custom DESeqTransform object from log-transformed counts.

esf <- SummarizedExperiment(log(counts(dds)+1), colData = colData(dds)) %>%
  DESeqTransform()
esf_assay <- assay(esf)
colData(esf)
## DataFrame with 72 rows and 3 columns
##       Genotype Condition        sizeFactor
##       <factor>  <factor>         <numeric>
## KF_01       WT       SPF  1.09854743602553
## KF_02       WT       SPF  1.13383224370949
## KF_03       WT       SPF  1.03268839468746
## KF_04       WT       SPF  1.08886640783561
## KF_05       WT       SPF  1.11798388435345
## ...        ...       ...               ...
## KF_68       KO        GF  0.89792789641738
## KF_69       KO        GF  1.03466716910215
## KF_70       KO        GF 0.994248301676296
## KF_71       KO        GF 0.896223923614269
## KF_72       KO        GF 0.976403314999965

WT SPF

esf_assay[, splits$WTSPF] %>% head(20) %>% datatable(filter = "bottom")

KO SPF

esf_assay[, splits$KOSPF] %>% head(20) %>% datatable(filter = "bottom")

WT GF

esf_assay[, splits$WTGF] %>% head(20) %>% datatable(filter = "bottom")

KO GF

esf_assay[, splits$KOGF] %>% head(20) %>% datatable(filter = "bottom")

visualization and comparisons of normalizations

hexbin plots

These graphs should help to understand how each transformation is affecting variance in the dataset. Standard deviation is graphed against means. Ideally, variance would be constant for different means and should result in a flat curve. Generally, variance shouldn’t be dependent on the mean.

raw data

p0 <- dds_dud %>% assay() %>% meanSdPlot()

ntd

p1 <- ntd_assay %>% meanSdPlot()

vst

p2 <- vsd_assay %>% meanSdPlot()

rlog

p3 <- rld_assay %>% meanSdPlot()

log10

p4 <- esf_assay %>% meanSdPlot()

count matrix

Taking top 20 counts/estimatedSizeFactors, and generating a pheatmap annotation

select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]
anno <- as.data.frame(colData(dds)[,c("Genotype","Condition")])

raw data

pheatmap(assay(dds_dud)[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=anno)

ntd

pheatmap(ntd_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=anno)

vsd

pheatmap(vsd_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=anno)

rlog

pheatmap(rld_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=anno)

log10

pheatmap(esf_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=anno)

sample-to-sample distance

Procuring a distance matric. It portrays how similar samples are to each others. Darker blue = more similar.

raw data

sampleDists <- dist(t(assay(dds_dud)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(dds_dud$Genotype, dds_dud$Condition, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors,
         show_rownames = F)

ntd

sampleDists <- dist(t(ntd_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(ntd$Genotype, ntd$Condition, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors,
         show_rownames = F)

vsd

sampleDists <- dist(t(vsd_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Genotype, vsd$Condition, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors,
         show_rownames = F)

rlog

sampleDists <- dist(t(rld_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$Genotype, rld$Condition, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors,
         show_rownames = F)

log10

sampleDists <- dist(t(esf_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(esf$Genotype, esf$Condition, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors,
         show_rownames = F)

PCA

raw data

# plotPCA(dds_dud, intgroup=c("Time"))
plotPCA(dds_dud, intgroup=c("Genotype","Condition"))

ntd

# plotPCA(ntd, intgroup=c("Time"))
plotPCA(ntd, intgroup=c("Genotype","Condition"))

vst

# plotPCA(vsd, intgroup=c("Time"))
plotPCA(vsd, intgroup=c("Genotype", "Condition"))

rlog

# plotPCA(rld, intgroup=c("Time"))
plotPCA(rld, intgroup=c("Genotype", "Condition"))

log10

# plotPCA(esf, intgroup=c("Time"))
plotPCA(esf, intgroup=c("Genotype","Condition"))

conclusions

All normalizations led to the same categorization by a blinded pheatmap in heatmaps and PCA, suggesting that trends in data were not seriously impacted by normalization methods, and that trends were robust. log10 and ntd both had worse homoscedasticity than the rest, but ntd had the greatest sample distances, closely followed by log10, vst, and rlog in that order. vst had the best homoscedasticity.
Thus, vst may be the best normalization to use for the data.

Saving vst data to csv file.

write.csv2(vsd_assay, "featurecounts_filtered_vstnormalized.csv", sep = "\t")
## Warning in write.csv2(vsd_assay, "featurecounts_filtered_vstnormalized.csv", :
## attempt to set 'sep' ignored

session info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux 7.4 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /software/openblas-0.2.19-el7-x86_64/lib/libopenblas_haswellp-r0.2.19.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.3.0                ggplot2_3.3.0              
##  [3] RColorBrewer_1.1-2          pheatmap_1.0.12            
##  [5] vsn_3.54.0                  biomaRt_2.42.1             
##  [7] DT_0.13                     magrittr_1.5               
##  [9] dplyr_0.8.5                 DESeq2_1.26.0              
## [11] SummarizedExperiment_1.16.1 DelayedArray_0.12.3        
## [13] BiocParallel_1.20.1         matrixStats_0.56.0         
## [15] Biobase_2.46.0              GenomicRanges_1.38.0       
## [17] GenomeInfoDb_1.22.1         IRanges_2.20.2             
## [19] S4Vectors_0.24.4            BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       ggsignif_0.6.0         ellipsis_0.2.0.1      
##   [4] rio_0.5.16             htmlTable_1.13.3       XVector_0.26.0        
##   [7] base64enc_0.1-3        rstudioapi_0.10        hexbin_1.27.3         
##  [10] farver_2.0.1           affyio_1.56.0          bit64_0.9-7           
##  [13] AnnotationDbi_1.48.0   splines_3.6.1          geneplotter_1.64.0    
##  [16] knitr_1.28             jsonlite_1.6           Formula_1.2-3         
##  [19] broom_0.5.6            annotate_1.64.0        cluster_2.1.0         
##  [22] dbplyr_1.4.2           shiny_1.3.2            BiocManager_1.30.10   
##  [25] compiler_3.6.1         httr_1.4.0             backports_1.1.4       
##  [28] assertthat_0.2.1       Matrix_1.2-18          limma_3.42.2          
##  [31] later_0.8.0            acepack_1.4.1          htmltools_0.3.6       
##  [34] prettyunits_1.0.2      tools_3.6.1            gtable_0.3.0          
##  [37] glue_1.3.1             GenomeInfoDbData_1.2.2 affy_1.64.0           
##  [40] rappdirs_0.3.1         Rcpp_1.0.3             carData_3.0-3         
##  [43] cellranger_1.1.0       vctrs_0.3.0            preprocessCore_1.48.0 
##  [46] nlme_3.1-140           crosstalk_1.0.0        xfun_0.13             
##  [49] stringr_1.4.0          openxlsx_4.1.5         mime_0.7              
##  [52] lifecycle_0.2.0        rstatix_0.5.0          XML_3.99-0.3          
##  [55] zlibbioc_1.32.0        scales_1.1.0           promises_1.0.1        
##  [58] hms_0.5.3              yaml_2.2.0             curl_3.3              
##  [61] memoise_1.1.0          gridExtra_2.3          rpart_4.1-15          
##  [64] latticeExtra_0.6-28    stringi_1.4.3          RSQLite_2.2.0         
##  [67] genefilter_1.68.0      checkmate_2.0.0        zip_2.0.4             
##  [70] rlang_0.4.6            pkgconfig_2.0.2        bitops_1.0-6          
##  [73] evaluate_0.14          lattice_0.20-38        purrr_0.3.2           
##  [76] labeling_0.3           htmlwidgets_1.3        bit_1.1-15.2          
##  [79] tidyselect_1.1.0       R6_2.4.0               generics_0.0.2        
##  [82] Hmisc_4.2-0            DBI_1.1.0              pillar_1.4.4          
##  [85] haven_2.2.0            foreign_0.8-71         withr_2.1.2           
##  [88] abind_1.4-5            survival_3.1-12        RCurl_1.98-1.2        
##  [91] nnet_7.3-12            tibble_3.0.1           crayon_1.3.4          
##  [94] car_3.0-7              BiocFileCache_1.10.2   rmarkdown_1.13        
##  [97] progress_1.2.2         readxl_1.3.1           locfit_1.5-9.4        
## [100] grid_3.6.1             data.table_1.12.8      blob_1.2.1            
## [103] forcats_0.5.0          digest_0.6.20          xtable_1.8-4          
## [106] httpuv_1.5.1           tidyr_1.0.3            openssl_1.4           
## [109] munsell_0.5.0          askpass_1.1
 

By Sumeed Yoyo Manzoor

smanzoor@uchicago.edu



Back to top